####################################################################################

#Making the bourgeoisie? Values, voice, and state-provided homeownership
#File edited 03/01/2022 to add attrition exploration 
#This file recodes variables and performs and saves the analyses for W1
#Figures and tables are created by other files
#See readme.txt for replication instructions

####################################################################################

rm(list=ls())
library(AER)
library(tidyverse)
library(stringi)
library(stringr)
library(readr)
library(readxl)
library(ggplot2)
library(lubridate)
library(sandwich)
library(ri)
library(RItools)
library(boot)
library(corrgram)
library(psych)
library(forcats)
library(estimatr)
library(xtable)
library(fastDummies)
library(DeclareDesign)
library(randomizr)
library(ANOVAreplication)
library(miceadds)
library(clubSandwich)
options(scipen=6)

setwd("~/Dropbox/Replication-Making-JoP/") #Set WD for replication
source("Code/functions.R") #auxiliary functions for analysis

####################################################################################
# Read W1 dataset and create variables                                      ########
####################################################################################

load("Data/surveyW1-Making.RData")
print(comment(survey))

# Define Compliance
survey$D_composite <- survey$ganhou_composite

# Create attitudes index
set.seed(1977)
attitudes_index_vars <- imp.mice(survey %>% select(tax_item,
                                                   red_abstract,
                                                   red_concrete,
                                                   effortbetter,
                                                   trustothers,
                                                   successalone,
                                                   moneyimportant,
                                                   govind, 
                                                   competition))
survey <- survey %>% bind_cols(
  attitudes_index = calculate_mean_effects_index(Z = survey$Z,
                                                 outcome_mat = attitudes_index_vars, greedy = TRUE))

# Create welfare index
set.seed(1977)
welfare_index_vars <- imp.mice(survey %>% select(placement,
                                                 pea,
                                                 inc_above1mw))
survey <- survey %>% bind_cols(
  welfare_index = calculate_mean_effects_index(Z = survey$Z,
                                              outcome_mat = welfare_index_vars, greedy = TRUE))

surveyw1 <- survey
rm(survey, welfare_index_vars, attitudes_index_vars)

####################################################################################
# Declare variables for W1 (controls and outcomes)                            ######
####################################################################################

# Declare labels for control and balance variables
pre.treat <-  c(
    "sexor", 
    "race_bin",
    "religiao_dummy",
    "criancas_moram",
    "sch_years",
    "cadunico",
    "formal",
    "av_formalinc_log")
  
bal.labels <- c(
    "Sex (male)",
    "Race (white)", 
    "Religion (any)",
    "Children (N)",
    "Schooling (years)",
    "Registry (Cadunico)",
    "Years in Formal Employment",
    "Avg. Formal Wages")

####################################################################################
# Declare outcomes for analysis                                               ######
####################################################################################

outcomes1 <- c(
  "attitudes_index",
  "self_index",
  "market_index",
  "red_index"
)
outcomes2 <- c(
  "welfare_index","placement",
  "pea",
  "inc_above1mw"
)  
outcomes3 <- c("hap_index")
outcomes4 <-  c("prosp_self") 

#These definitions are needed below, in different parts of the code
the.fam <- paste("outcomes",1:4,sep="")
outcomes <- do.call("c",sapply(paste("outcomes",1:4,sep=""),"get"))
out.class <- paste("H",1:4,sep="")
names(outcomes)<- NULL
out.selected <- c(1,5,9,10) #select the main outcomes  
the.labels <-  c("Attitudes Index","Individualism",
                 "Market Beliefs","Redistribution",
                 "Welfare Index","Placement","PEA","Inc.>1MW",
                 "Happiness",
                 "Expectations")

# Check whether labels are correct
main <- rep(0,length(outcomes))
main[out.selected] <- 1
var.labels<- data.frame(outcomes,the.labels,main)
print(var.labels)

# Print summary statistics 
sumtab_raw <- summary_stats(data=subset(surveyw1,select=c("Z","D_composite",pre.treat,outcomes,
                                                      "tax_item","red_abstract","red_concrete",
                                                      "effortbetter","trustothers","successalone","moneyimportant",
                                                      "govind","competition")))
#Label most important variables to facilitate reporting of the table
rownames(sumtab_raw)[3:20] <- c(bal.labels,the.labels)


#Re-scaling all outcomes
surveyw1[outcomes] <- scalev(outcomes,surveyw1)

#Standardized Descriptive Table
sumtab_std <- summary_stats(data=subset(surveyw1,select=c("Z","D_composite",pre.treat,outcomes)))

out.sum.W1 <- list(sumtab.raw=sumtab_raw, sumtab.std=sumtab_std)
comment(out.sum.W1) <- paste("Estimated in ",Sys.Date(),". Summary Stats ",sep="")
save(out.sum.W1,file=paste("Routputs/out-W1-sum.RData",sep=""))

####################################################################################
# Analysis of Pooled Effects                                                  ######
# Estimation (3 estimators ITT-lin, ITT-lm, CACE-lm; 4 variants in each)      ######
# (*) Preferred estimation                                                    ######              
# Summary tables (combining selected ITT and CACE)                            ######
####################################################################################

####  ITT-Lin: four variants  ####
#out.lin       & (without age and without controls, with lottery ID as covariate and survey weights) 
#out.linc.noage& (without age and with controls, with lottery ID as covariate and survey weights)
#out.lin.age   & (with age and without controls, with lottery ID as covariate and survey weights) 
#out.linc      & (with age and with controls, with lottery ID as covariate and survey weights) (*) 
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital")
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lin, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lin <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lin$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lin[get(the.fam[ii]),"HBp"] <- p.adjust(out.lin[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lin$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital",pre.treat)
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lin, the.data=surveyw1))
rownames(tmp) <- outcomes
out.linc.noage <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.linc.noage$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.linc.noage[get(the.fam[ii]),"HBp"] <- p.adjust(out.linc.noage[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}  
out.linc.noage$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital","idade")
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lin, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lin.age <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lin.age$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lin.age[get(the.fam[ii]),"HBp"] <- p.adjust(out.lin.age[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lin.age$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital","idade",pre.treat)
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lin, the.data=surveyw1))
rownames(tmp) <- outcomes
out.linc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family and excluding main outcome
out.linc$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.linc[get(the.fam[ii]),"HBp"] <- p.adjust(out.linc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.linc$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

#collect all results
out.itt.lin <- list(out.lin=out.lin,out.linc.noage=out.linc.noage,
                    out.lin.age=out.lin.age,out.linc=out.linc)  


####  ITT-Lm: four variants  #### 
#out.lm        & (without age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmc.noage & (without age and with controls, with lottery ID as fixed effect and survey weights)
#out.lm.age    & (with age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmc       & (with age and with controls, with lottery ID as fixed effect and survey weights) (*) 
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= NULL
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lm, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lm <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lm$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lm[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lm$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c(pre.treat)
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lm, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lmc.noage <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmc.noage$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmc.noage[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc.noage[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmc.noage$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= "idade"
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lm, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lm.age <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lm.age$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lm.age[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm.age[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lm.age$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("idade",pre.treat)
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lm, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lmc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmc$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmc[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmc$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

#collect all results
out.itt.lm <- list(out.lm=out.lm,out.lmc.noage=out.lmc.noage,
                   out.lm.age=out.lm.age,out.lmc=out.lmc)  

####  CACE-Lm: four variants (D_composite) #### 
#out.lmcace        & (without age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmcacec.noage & (without age and with controls, with lottery ID as fixed effect and survey weights) 
#out.lmcace.age    & (with age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmcacec       & (with age and with controls, with lottery ID as fixed effect and survey weights) (*) 
tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D_composite" 
                        ,the.data=surveyw1
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_cace)) 
rownames(tmp) <- outcomes
out.lmcace <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmcace$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmcace[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcace[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmcace$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D_composite" 
                        ,the.data=surveyw1
                        ,ctrls=pre.treat
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_cace)) 
rownames(tmp) <- outcomes
out.lmcacec.noage <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmcacec.noage$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmcacec.noage[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcacec.noage[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmcacec.noage$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D_composite" 
                        ,the.data=surveyw1
                        ,ctrls="idade"
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_cace)) 
rownames(tmp) <- outcomes
out.lmcace.age <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmcace.age$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmcace.age[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcace.age[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}  
out.lmcace.age$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D_composite" 
                        ,the.data=surveyw1
                        ,ctrls=c("idade",pre.treat)
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_cace)) 
rownames(tmp) <- outcomes
out.lmcacec <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmcacec$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmcacec[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcacec[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}    
out.lmcacec$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

out.cace.lm <- list(out.lmcace=out.lmcace,out.lmcacec.noage=out.lmcacec.noage,
                    out.lmcace.age=out.lmcace.age,out.lmcacec=out.lmcacec)  

### Collect all results (list of lists)
out.all <- list(out.itt.lm=out.itt.lm,out.itt.lin=out.itt.lin,
                out.cace.lm=out.cace.lm)
comment(out.all) <- paste("Estimated in ",Sys.Date(),". Four estimators, four variants in each.",sep="")
save(out.all,file=paste("Routputs/out-W1-estimates.RData",sep=""))

####################################################################################
# Estimate effects for Edital 17.2016 only                                 ########
####################################################################################

####  ITT-Lm: four variants  #### 
#out.lm        & (without age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmc.noage & (without age and with controls, with lottery ID as fixed effect and survey weights) 
#out.lm.age    & (with age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmc       & (with age and with controls, with lottery ID as fixed effect and survey weights) (*) 

lot17 <- surveyw1 %>% filter(id_edital == "edital17.2016")

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= NULL
                        ,w=1/lot17$sampling_prob
                        ,FUN = est_lm_lot, the.data=lot17))
rownames(tmp) <- outcomes
out.lm <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lm$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lm[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lm$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c(pre.treat)
                        ,w=1/lot17$sampling_prob
                        ,FUN = est_lm, the.data=lot17))
rownames(tmp) <- outcomes
out.lmc.noage <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmc.noage$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmc.noage[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc.noage[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmc.noage$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= "idade"
                        ,w=1/lot17$sampling_prob
                        ,FUN = est_lm, the.data=lot17))
rownames(tmp) <- outcomes
out.lm.age <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lm.age$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lm.age[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm.age[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lm.age$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP


tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("idade",pre.treat)
                        ,w=1/lot17$sampling_prob
                        ,FUN = est_lm, the.data=lot17))
rownames(tmp) <- outcomes
out.lmc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmc$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmc[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmc$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

out.itt.lm17 <- list(out.lm=out.lm,out.lmc.noage=out.lmc.noage,
                     out.lm.age=out.lm.age,out.lmc=out.lmc)  

####################################################################################
# Estimate effects for Edital 20.2016 only                                 ########
####################################################################################

####  ITT-Lm: four variants  #### 
#out.lm        & (without age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmc.noage & (without age and with controls, with lottery ID as fixed effect and survey weights) 
#out.lm.age    & (with age and without controls, with lottery ID as fixed effect and survey weights) 
#out.lmc       & (with age and with controls, with lottery ID as fixed effect and survey weights) (*) 

lot20 <- surveyw1 %>% filter(id_edital == "edital20.2016")

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= NULL
                        ,w=1/lot20$sampling_prob
                        ,FUN = est_lm_lot, the.data=lot20))
rownames(tmp) <- outcomes
out.lm <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lm$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lm[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lm$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c(pre.treat)
                        ,w=1/lot20$sampling_prob
                        ,FUN = est_lm, the.data=lot20))
rownames(tmp) <- outcomes
out.lmc.noage <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmc.noage$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmc.noage[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc.noage[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmc.noage$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= "idade"
                        ,w=1/lot20$sampling_prob
                        ,FUN = est_lm, the.data=lot20))
rownames(tmp) <- outcomes
out.lm.age <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lm.age$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lm.age[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm.age[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lm.age$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("idade",pre.treat)
                        ,w=1/lot20$sampling_prob
                        ,FUN = est_lm, the.data=lot20))
rownames(tmp) <- outcomes
out.lmc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmc$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmc[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.lmc$HBp[which(main==1)] <- NA #main outcomes are not subject to correction per PAP

out.itt.lm20 <- list(out.lm=out.lm,out.lmc.noage=out.lmc.noage,
                     out.lm.age=out.lm.age,out.lmc=out.lmc)  

### Collect all lottery by lottery results  (in a list of lists)
out.all_bylot <- list(out.itt.lm17=out.itt.lm17, out.itt.lm20=out.itt.lm20)
comment(out.all_bylot) <- paste("Estimated in ",Sys.Date(),". Four estimators, four variants in each.",sep="")
save(out.all_bylot,file=paste("Routputs/out-W1-estimates-bylot.RData",sep=""))

####################################################################################
# Moderation by Wait Time in Pooled Analysis                                               ######
# Estimating one model (lm with controls, age, and fixed effects)
####################################################################################

# Describe all moderators used in paper
moderators <- c("treatwait")

# Check, first, "balance" on long/short wait (Implements an F-test) 
pre.treat <- pre.treat
the.formula <- formula(paste("treatwait~",paste(pre.treat,collapse="+")))
the.treated <- subset(surveyw1,Z=1)
the.treated$treatwait <- the.treated$treatwait=="Z_2waiting"
m1 <- lm_robust(the.formula,
                data = the.treated, clusters = fieldID,
                weights = 1/the.treated$sampling_prob,
                se_type = "stata")
#Standard F-test
the.wald <- lmtest::waldtest(m1,  test = c("F"))
print(the.wald)

ballmr <- bind_rows(lapply(1:length(pre.treat), outcomes=pre.treat,
                           FUN = est_lm,
                           treat="treatwait",
                           ctrls = NULL,
                           clusters = "fieldID",
                           the.data=the.treated))
rownames(ballmr) <- pre.treat
out.mod.ballmr <- list(out.bal.wait=ballmr)
comment(out.mod.ballmr) <- paste("Estimated in ",Sys.Date(),". Balance Het Treat Effect.",sep="")
save(out.mod.ballmr,file=paste("Routputs/out-W1-bal-moderators.RData",sep=""))

#now the actual moderation
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,treat="treatwait"
                        ,fe="id_edital"
                        ,ctrls= c("idade",pre.treat)
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = reg_hetfactor, the.data=surveyw1))
rownames(tmp) <- outcomes
out.wait <- as.data.frame(tmp)

### Save moderation results (in list of lists)
out.mod.all <- list(out.wait=out.wait)
comment(out.mod.all) <- paste("Estimated in ",Sys.Date(),". One speficication for each model.",sep="")
save(out.mod.all,file=paste("Routputs/out-W1-estimates-moderators.RData",sep=""))

####################################################################################
# Balance Tests by Lottery                                       
####################################################################################

# Declare labels for control and balance variables
controls <- pre.treat

the.lotteries <- unique(surveyw1$id_edital)
# Loop over lotteries  #
bal.stats <- list() #object to store balance results
for(ii in 1:length(the.lotteries)){ 
  print(the.lotteries[ii])
  lot <- subset(surveyw1,id_edital==the.lotteries[ii])
  #Implments an F-test
  the.formula <- formula(paste("Z~idade+",paste(pre.treat,collapse="+")))
  m1 <- lm_robust(the.formula, 
                  data = lot, 
                  weights = 1/sampling_prob,
                  se_type = "stata")
  m0 <- lm_robust(Z~idade, 
                  data = lot, 
                  weights = 1/sampling_prob,
                  se_type = "stata")
  #Standard F-test 
  the.wald <- lmtest::waldtest(m1, m0,  test = c("F"))
  print(the.wald)
  
  # Compute the p-value of the F-tests based on the permutation distribution of W
  W_obs <- lmtest::waldtest(m1, m0,  test = c("F"))[2,3] #store f-test for bootstrapped p-values 
  N <- nrow(lot)
  sims <- 1000
  n_treat <- as.numeric(table(lot$Z)[2])
  W_sims <- numeric(sims)
  set.seed(30033)
  for(i in 1:sims){
    lot$Z_sim <- complete_ra(N, n_treat)
    fit_sim1 <- lm_robust(formula(paste("Z_sim~idade+",paste(pre.treat,collapse="+"))), 
                          data = lot, 
                          weights = 1/sampling_prob, 
                          se_type = "stata")
    fit_sim0 <- lm_robust(Z_sim ~ idade, data = lot, weights = 1/sampling_prob,  
                          se_type = "stata")
    W_sims[i] <- lmtest::waldtest(fit_sim1, fit_sim0,  test = c("F"))[2,3]
  }
  p.permutation <- mean(W_sims >= W_obs)
  # Bootrsapped p-value o F-test
  cat("Simulated p-value:",p.permutation,"\n")   
  
  # Compute balance variable by variable controlling for age
  ballm <- bind_rows(lapply(1:length(pre.treat), outcomes=c(pre.treat)
                            , ctrls = "idade"
                            , FUN = est_lin, the.data=lot
                            , w = 1/lot$sampling_prob)) 
  rownames(ballm) <- pre.treat
  print(ballm)
  # Compute balance variable by variable controlling for age
  ballmnoage <- bind_rows(lapply(1:length(pre.treat), outcomes=c(pre.treat)
                                 , FUN = est_lm, the.data=lot
                                 , w = 1/lot$sampling_prob)) 
  rownames(ballmnoage) <- pre.treat
  print(ballmnoage)
  #assemble objects with balance stats
  bal.stats[[the.lotteries[ii]]] <- list(N=table(lot$Z),wald=the.wald
                                         ,p.permutation=p.permutation
                                         ,balage=ballm
                                         ,balnoage=ballmnoage)
} #end loop for lottery
save(bal.stats,file=paste("Routputs/out-W1-balancebylottery.RData",sep=""))

####################################################################################
# Balance in pooled set                                                      ###########
####################################################################################

pol.bal.stats <- list() #object to store balance results
#Implments an F-test
the.formula <- formula(paste("Z~idade+",paste(pre.treat,collapse="+")))
m1 <- lm_robust(the.formula, 
                data = surveyw1, clusters = fieldID,
                fixed_effects =~id_edital, 
                weights = 1/sampling_prob,
                se_type = "stata")
m0 <- lm_robust(Z~idade, 
                data = surveyw1, clusters = fieldID,
                fixed_effects =~id_edital,
                weights = 1/sampling_prob,
                se_type = "stata")
#Standard F-test 
ftest <-  lmtest::waldtest(m1, m0,  test = c("F"))
print(ftest)

# Compute the p-value of the F-tests based on the permutation distribution of W
W_obs <- lmtest::waldtest(m1, m0,  test = c("F"))[2,3] #store f-test for boostrapped p-values 
N <- nrow(surveyw1)
sims <- 1000
n_treat <- as.numeric(table(surveyw1$Z)[2])
W_sims <- numeric(sims)
set.seed(30033)
for(i in 1:sims){
  surveyw1$Z_sim <- complete_ra(N, n_treat)
  fit_sim1 <- lm_robust(formula(paste("Z_sim~idade+",paste(pre.treat,collapse="+"))), 
                        data = surveyw1,
                        cluster = fieldID, 
                        fixed_effects =~id_edital,
                        se_type = "stata",
                        weights = 1/sampling_prob)
  fit_sim0 <- lm_robust(Z_sim ~ idade, 
                        data = surveyw1, 
                        weights = 1/sampling_prob,  
                        cluster = fieldID, 
                        fixed_effects =~id_edital,
                        se_type = "stata")
  W_sims[i] <- lmtest::waldtest(fit_sim1, fit_sim0,  test = c("F"))[2,3]
}
p.permutation <- mean(W_sims >= W_obs)
# Bootrsapped p-value o F-test
cat("Simulated p-value:",p.permutation,"\n")

# Compute balance variable by variable controlling for age
ballm <- bind_rows(lapply(1:length(pre.treat), outcomes=pre.treat
                          , FUN = est_lin
                          , ctrls = c("idade","id_edital")
                          , clusters = "fieldID"
                          , w=1/surveyw1$sampling_prob
                          , the.data=surveyw1))
rownames(ballm) <- pre.treat
print(ballm)
# Compute balance variable by variable controlling without controlling for age  
ballmnoage <- bind_rows(lapply(1:length(pre.treat), outcomes=pre.treat
                               , FUN = est_lin
                               , ctrls = c("id_edital")
                               , clusters = "fieldID"
                               , w=1/surveyw1$sampling_prob
                               , the.data=surveyw1))
rownames(ballmnoage) <- pre.treat
print(ballmnoage)
#assemble objects with balance stats
pol.bal.stats <- list(N=table(surveyw1$Z),wald=ftest,p.permutation=p.permutation,
                      balage=ballm,
                      balnoage=ballmnoage)
save(pol.bal.stats,file=paste("Routputs/out-W1-balancepooled.RData",sep=""))


####################################################################################
# Attrition                                                           ################
# A different data set (of attempts) is loaded used                   ################
####################################################################################

#Data for attrition analysis
load("Data/W1_attrition_overall_public.Rda")
load("Data/W1_attrition_public.Rda")

### Answered by treatment and control (overall contacted -- may or may not have answered)
attrition_overall <- lm_lin(interviewed ~ treat_ind, cluster = fieldID,
                            se_type = "stata", covariates = ~ id_edital,
                            weights = 1/sampling_prob,
                            data = W1_attrition_overall)

attrition_overall_fe <- lm_robust(interviewed ~ treat_ind, cluster = fieldID,
                            se_type = "stata", fixed_effects = ~ id_edital,
                            weights = 1/sampling_prob,
                            data = W1_attrition_overall)

### Answered by treatment and control (among those who answered)
attrition_answered <- lm_lin(interviewed ~ treat_ind, cluster = fieldID,
                             se_type = "stata", covariates = ~ id_edital,
                             weights = 1/sampling_prob,
                             data = W1_attrition)

attrition_answered_fe <- lm_robust(interviewed ~ treat_ind, cluster = fieldID,
                                  se_type = "stata", fixed_effects = ~ id_edital,
                                  weights = 1/sampling_prob,
                                  data = W1_attrition)

##### Remove missing to perform Wald Test
W1_attrition_overall2 <- W1_attrition_overall %>% filter(!is.na(sexor))
W1_attrition2 <- W1_attrition %>% filter(!is.na(sexor))

#Including interactions of covariates
attrition_overall_covi <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel + 
                                      treat_ind*formal_pre_prop + treat_ind*sexor + treat_ind*av_prel, 
                                    se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                    weights = 1/sampling_prob,
                                    data = W1_attrition_overall2)

attrition_answered_covi <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel + 
                                       treat_ind*formal_pre_prop + treat_ind*sexor + treat_ind*av_prel, 
                                     se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                     weights = 1/sampling_prob,
                                     data = W1_attrition2)

### No interactions
attrition_overall_cov <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel,
                                   se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                   weights = 1/sampling_prob,
                                   data = W1_attrition_overall2)

attrition_answered_cov <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel,
                                    se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                    weights = 1/sampling_prob,
                                    data = W1_attrition2)

#Joint hypotheses test
wald_interviewed <- waldtest(attrition_overall_covi, attrition_overall_cov)
wald_interviewed_pickedup <- waldtest(attrition_answered_covi, attrition_answered_cov)

attrition.stats <- list(wald_interviewed = wald_interviewed, 
                        wald_interviewed_pickedup = wald_interviewed_pickedup, 
                        attrition_overall = attrition_overall, 
                        attrition_overall_fe = attrition_overall_fe, 
                        attrition_answered = attrition_answered, 
                        attrition_answered_fe = attrition_answered_fe)
                        
save(attrition.stats,file=paste("Routputs/out-W1-attrition.RData",sep=""))


####################################################################################
# Exporatory unplanned analysis
# Outcomes are the individual items in attitude indices                       ######
# Estimating only two variants of each estimator,                             ######
# Estimation (3 estimators ITT-lin, ITT-lm, CACE-lm)                          ######
# Summary tables (combining selected ITT and CACE)                            ######
####################################################################################

outcomesu1 <- c(
  "effortbetter",
  "trustothers",
  "successalone",
  "moneyimportant")

outcomesu2 <- c(
  "tax_item",
  "red_abstract",
  "red_concrete")

outcomesu3 <- c(
  "govind",
  "competition")

w1.attitudes <- subset(surveyw1,select=c(outcomesu1,outcomesu2,outcomesu3,pre.treat
                                         ,"fieldID","Z","D_composite","id_edital"))
save(w1.attitudes,file="Routputs/data-w1-attitudes.RData")

uG <- length(grep("outcomesu",ls()))
the.fam <- paste("outcomesu",1:uG,sep="")
outcomes <- do.call("c",sapply(paste("outcomesu",1:uG,sep=""),"get"))
out.class <- paste("H",1:uG,sep="")
names(outcomes)<-NULL
out.selected <- c(1,5,8)
names(outcomes)<-NULL
the.labels <-  c("Effort Better", "Trust Others", "Success Alone",
                 "Money Important",
                 "Tax Item", "Redistribution (Abstract)",
                 "Redistribution (Concrete)","Govt vs. Individual",
                 "Comp. is Good"
)

# Check whether labels are correct
print(data.frame(outcomes,the.labels))

# Print summary statistics
sumtab <- summary_stats(data=subset(surveyw1,select=c("Z","D_composite",pre.treat,outcomes))) 
print(round(sumtab,2))

#Re-scaling outcomes
surveyw1[outcomes] <- scalev(outcomes,surveyw1) 

####  ITT-Lin: two variants only for unplanned outcomes  ####
#out.lin       & (no age and no controls, edital as covariate and survey weights)
#out.linc      &  (with age and controls, edital as a covariate and survey weights)(*)
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital")
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lin, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lin <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]
#Compute HB corrected p-values by hypothesis family
out.lin$HBp <- NA
for(ii in 1:length(the.fam)){
  out.lin[get(the.fam[ii]),"HBp"] <- p.adjust(out.lin[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital","idade",pre.treat)
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lin, the.data=surveyw1))
rownames(tmp) <- outcomes
out.linc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]
#Compute HB corrected p-values by hypothesis family
out.linc$HBp <- NA
for(ii in 1:length(the.fam)){
  out.linc[get(the.fam[ii]),"HBp"] <- p.adjust(out.linc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

out.itt.un.lin <- list(out.lin=out.lin,out.linc=out.linc)

####  ITT-Lm: two variants  ####
#out.lm        & (no age and no controls, edital as fixed effect and survey weights)
#out.lmc       & (with age and controls, edital as a fixed effect and survey weights)(*)
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= NULL
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lm, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lm <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]
#Compute HB corrected p-values by hypothesis family
out.lm$HBp <- NA
for(ii in 1:length(the.fam)){
  out.lm[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("idade",pre.treat)
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_lm, the.data=surveyw1))
rownames(tmp) <- outcomes
out.lmc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]
#Compute HB corrected p-values by hypothesis family
out.lmc$HBp <- NA
for(ii in 1:length(the.fam)){
  out.lmc[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

out.itt.un.lm <- list(out.lm=out.lm,out.lmc=out.lmc)

####  CACE-Lm: two variants (D_composite) ####
#out.lmcace        & (no age and no controls, edital as fixed effect and survey weights)
#out.lmcacec       & (with age and controls, edital as a fixed effect and survey weights)(*)

tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D_composite" #remember to check with "D" too!
                        ,the.data=surveyw1
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_cace))
rownames(tmp) <- outcomes
out.lmcace <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]
#Compute HB corrected p-values by hypothesis family
out.lmcace$HBp <- NA
for(ii in 1:length(the.fam)){
  out.lmcace[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcace[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D_composite" #remember to check with "D" too!
                        ,the.data=surveyw1
                        ,ctrls=c("idade",pre.treat)
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,w=1/surveyw1$sampling_prob
                        ,FUN = est_cace))
rownames(tmp) <- outcomes
out.lmcacec <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]
#Compute HB corrected p-values by hypothesis family
out.lmcacec$HBp <- NA
for(ii in 1:length(the.fam)){
  out.lmcacec[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcacec[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

out.cace.un.lm <- list(out.lmcace=out.lmcace,out.lmcacec=out.lmcacec)

### Collect all results (list of lists)
out.un.all <- list(out.itt.un.lm=out.itt.un.lm,
                   out.itt.un.lin=out.itt.un.lin,
                   out.cace.un.lm=out.cace.un.lm)
comment(out.un.all) <- paste("Estimated in ",Sys.Date(),". Two estimators, two variants in each.",sep="")
save(out.un.all,file=paste("Routputs/out-W1-estimates-unplanned.RData",sep=""))

# ####################### Additional exploratory analysis about attrition 
# ####################### Not shown in paper or appendix
# 
# ####### Describing attrition in W1
# 
# table(W1_attrition_overall$interviewed)
# prop.table(table(W1_attrition_overall$interviewed)*100)
# #About 10% of responded with recorded outcomes
# 
# #### Describing attrition by treatment arm
# 
# prop.table(table(W1_attrition_overall$interviewed, W1_attrition_overall$treat_ind), 2)*100
# 
# #Overall, small differences across treatment arms, less than 1%.
# 
# #### Describing attrition by treatment arm within block
# 
# prop.table(table(W1_attrition_overall$interviewed, W1_attrition_overall$TT), 2)*100
# 
# #Differences are small by treatment arm with blocks. 
# #In 17-2016, greater response in treatment compared to control
# #In 20-2016 greater response in control compared to treatment
# 
# #Edital 17/2016 carries a lot of weight in the attitudes outcome analysis,
# #we can conduct trimming bounds within this edital. For this exercise, we assume ITRs for this
# #edital and equal probability of assignment to treatment.
# 
# surveyw1 <- surveyw1 %>% mutate(ipw = 1/sampling_prob)
# surveyw1_ed17 <- surveyw1 %>% filter(id_edital == "edital17.2016")
# dat <- W1_attrition_overall %>% filter(id_edital == "edital17.2016")
# 
# Fail <- 1 - dat$interviewed #missing data indicator
# Z <- dat$treat_ind #treatment vector
# Weight <- rep(1, length(Fail)) #Number of observations
# 
# #Calculate Q
# 
# f0 <- sum(Weight[Fail == 0 & Z == 0]) / sum(Weight[Z == 0]) #attrition in control
# f1 <- sum(Weight[Fail == 0 & Z == 1]) / sum(Weight[Z == 1]) #attrition in treatment
# 
# Q <- (f1 - f0)/f1 #8.115929 of the units
# 
# #Number of units in treatment to be removed
# table(surveyw1_ed17$Z)
# 235*Q 
# #need to trim 19
# dataf <- surveyw1_ed17 %>% select(attitudes_index, Z, id_edital, ipw, fieldID) %>% filter(id_edital == "edital17.2016")
# 
# #For outcome attitudes
# bounds <- function(datad, d){
#   
#   dataff <- datad[d, ]  
#   datafsort <- dataff[order(dataff$attitudes_index), ]
#   OutS1 <- datafsort[datafsort$Z == 1, ]
#   OutS1trimmedlower <- OutS1[-c(1:19),]
#   OutS1trimmedupper <- OutS1[-c(216:235),]
#   OutS0 <- datafsort[datafsort$Z == 0, ]
#   
#   #Upper bound
#   high_est = weighted.mean(OutS1trimmedlower$attitudes_index, w = OutS1trimmedlower$ipw) - weighted.mean(OutS0$attitudes_index, w = OutS0$ipw)
#   
#   #Lower bound
#   low_est = weighted.mean(OutS1trimmedupper$attitudes_index, w = OutS1trimmedupper$ipw) - weighted.mean(OutS0$attitudes_index, w = OutS0$ipw)
#   
#   return(c(low_est = low_est, high_est = high_est))
#   
# }
# 
# #Getting CIs
# set.seed(343)
# bootstrap_df <- boot(data = dataf, statistic = bounds,
#                      R=1000)
# bootstrap_df
# sd(bootstrap_df$t[,1])
# sd(bootstrap_df$t[,2])
# quantile(bootstrap_df$t[,1], c(0.025, 0.975))
# quantile(bootstrap_df$t[,2], c(0.025, 0.975))
# 



